In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Using Pipelines with Grid-Search

Feature selection and regression without pipelines


In [ ]:
from sklearn.datasets import make_regression

# Build a synthetic regression problem; the fixed random_state keeps the
# generated data identical from run to run.
X, y = make_regression(random_state=42, effective_rank=90)
# Report the dimensions of the generated feature matrix.
print(X.shape)

In [ ]:
# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20,
# so importing from it fails on any current scikit-learn.
from sklearn.model_selection import train_test_split

# Keep half of the samples for training; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=.5)

In [ ]:
from sklearn.feature_selection import SelectFpr, f_regression
from sklearn.linear_model import Ridge

# Fit the univariate selector on the training split only
# (fit returns the estimator, so `fpr` is still bound to the fitted selector).
fpr = SelectFpr(score_func=f_regression).fit(X_train, y_train)

# Run both splits through the same fitted selector.
X_train_fpr, X_test_fpr = (fpr.transform(split) for split in (X_train, X_test))

print(X_train_fpr.shape)

In [ ]:
# Train a default Ridge model on the selected training features
# (fit returns the estimator, so `ridge` is bound to the fitted model).
ridge = Ridge().fit(X_train_fpr, y_train)
# Evaluate on the test split that went through the same selector.
ridge.score(X_test_fpr, y_test)

With pipelines


In [ ]:
from sklearn.pipeline import make_pipeline

# Chain feature selection and regression into one estimator, then fit it on
# the raw training split; fit returns the pipeline itself.
pipe = make_pipeline(
    SelectFpr(score_func=f_regression),
    Ridge(),
).fit(X_train, y_train)

# The pipeline applies the selection step to X_test before scoring.
pipe.score(X_test, y_test)

Grid-Searching alpha in Ridge


In [ ]:
# GridSearchCV is provided by sklearn.model_selection; the former
# sklearn.grid_search module was deprecated in 0.18 and removed in 0.20,
# so importing from it fails on any current scikit-learn.
from sklearn.model_selection import GridSearchCV

# without pipeline: the grid key is just the estimator's own parameter name.
# Candidate values are the eight powers of ten from 1e-3 to 1e4.
param_grid_no_pipeline = {'alpha': 10. ** np.arange(-3, 5)}

In [ ]:
# The pipeline was built without explicit step names; list the names it was
# given, since they are the prefixes used in the grid-search parameter keys.
pipe.named_steps.keys()

In [ ]:
# with pipeline: address a step's parameter as "<step name>__<parameter>".
param_grid = {
    'ridge__alpha': np.power(10., np.arange(-3, 5)),
}
# Cross-validate the whole pipeline (10 folds) for every candidate alpha.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

In [ ]:
# Score the fitted grid search on the held-out test split.
grid.score(X_test, y_test)

In [ ]:
# Show the parameter combination the grid search selected.
grid.best_params_

Selecting parameters of the preprocessing steps


In [ ]:
# Search jointly over the regressor's regularisation strength and the
# selector's alpha, each addressed through its pipeline step name.
param_grid = {
    'ridge__alpha': np.power(10., np.arange(-3, 5)),
    'selectfpr__alpha': [0.01, 0.02, 0.05, 0.1, 0.3],
}
# fit returns the search object, so `grid` is bound to the fitted search.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10).fit(X_train, y_train)
grid.score(X_test, y_test)

In [ ]:
# Show the selected combination of ridge__alpha and selectfpr__alpha.
grid.best_params_

In [ ]:
# Pull the feature-selection step out of the best pipeline found by the search.
final_selectfpr = grid.best_estimator_.named_steps['selectfpr']
# Display the selector's support for the input features.
final_selectfpr.get_support()

In [ ]: